In [5]:
#crawl through imdb

#import some handy packages
from bs4 import BeautifulSoup
from urllib.request import urlopen
import time
import random
import re
import csv

In [6]:
#define function to process movies
def parsemovie(soup,writer):
  """Extract the cast list from a parsed movie page and write one row per cast member.

  Every row written has exactly seven fields, the same layout the record
  loader in this notebook unpacks:
  movie, movie_url, actor, actor_url, character, character_url, year.
  Each table cell contributes at most one (text, url) pair, taken from the
  first link in the cell. When fewer than two pairs are found (for example
  the character cell has no link) the missing fields are written as empty
  strings so the year stays in the last column. Rows in which no usable
  link is found at all are skipped and not counted.

  Args:
    soup: parsed movie page (BeautifulSoup-style object exposing
      ``title.string`` and ``find``).
    writer: object with a ``writerow(list)`` method, e.g. a ``csv.writer``.

  Returns:
    ``[movie, rowct, movie_url]`` where ``rowct`` is the number of rows written.
  """
  #get movie title from the html 'title' tag, trim whitespace, drop characters that cannot be encoded, then clean out IMDB text
  movie = soup.title.string.strip()
  movie = movie.encode('utf-8', 'ignore').decode('utf-8')
  movie = re.sub(' - IMDB', '', movie, flags=re.IGNORECASE)
  #the first run of four digits in the title is taken as the year
  m = re.search(r'(\d\d\d\d)', movie)
  year = int(m.group(1)) if m else ''
  #get movie url
  tag = soup.find("link", rel="canonical"); movie_url = tag["href"]
  #row-counter for the rows actually written
  rowct = 0
  #find the table that contains the cast list; without it there is nothing to write
  table = soup.find("table", class_="cast_list")
  if table is None:
    return [movie,rowct,movie_url]
  #iterate thru rows of the table
  for row in table.find_all("tr"):
    #find all the columns in the row
    cols = row.find_all("td")
    #skip this row if it doesn't contain more than one column
    if not (len(cols)>1): continue
    #collect one (text, url) pair per cell that holds a usable link
    pairs = []
    for col in cols:
      #get link stored within the cell (html "a" tag); cells without one are ignored
      link = col.find("a")
      if link is None: continue
      href = link.get("href")
      txt = link.text
      #proceed only if the link has a target and its text contains alphanumeric values
      if href is None or not re.search('[a-zA-Z0-9]', txt): continue
      txt = txt.strip().encode('utf-8', 'ignore').decode('utf-8')
      #strip the tracking query string and make the url absolute
      url = 'http://www.imdb.com' + re.sub(r'\?ref.+', '', href)
      pairs.append((txt, url))
    #a row without any linked name carries no record
    if not pairs: continue
    #build the record: movie title & url, then the collected pairs
    rec = [movie, movie_url]
    for txt, url in pairs:
      rec.append(txt); rec.append(url)
    #pad so that actor and character columns are always present
    while len(rec) < 6:
      rec.append('')
    #add the year to our list
    rec.append(year)
    #send our list to the output handle
    writer.writerow(rec)
    rowct+=1
  results = [movie,rowct,movie_url]
  return results

In [7]:
#define function to process actors
def parseactor(soup,writer):
  """Extract film credits from a parsed actor page and write one row per credit.

  Only the first ``filmo-category-section`` block on the page is read.
  Credits whose text contains ``(TV`` are skipped, as are credits that do
  not hold exactly two links (the first is taken as the movie, the second
  as the role). Each row written has the fields
  movie, movie_url, actor, actor_url, role, role_url, year.

  Args:
    soup: parsed actor page (BeautifulSoup-style object exposing
      ``title.string`` and ``find``).
    writer: object with a ``writerow(list)`` method, e.g. a ``csv.writer``.

  Returns:
    ``[actor, goodct, actor_url]`` where ``goodct`` is the number of rows
    written; 0 when the page has no filmography section.
  """
  #get actor name from the html 'title' tag, trim whitespace, drop characters that cannot be encoded, then clean out IMDB text
  actor = soup.title.string.strip()
  actor = actor.encode('utf-8', 'ignore').decode('utf-8')
  actor = re.sub(' - IMDB','',actor, flags=re.IGNORECASE)
  #get actor url
  tag = soup.find("link", rel="canonical"); actor_url = tag["href"]
  #find the tag at the start of the list of acting credits; a page without one simply has no credits
  div = soup.find("div", class_="filmo-category-section")
  credits = div.find_all("div", class_="filmo-row") if div is not None else []
  print(len(credits),"credits in filmography")
  goodct=0
  #iterate thru list of credits
  for row in credits:
    txt = row.text.strip(); txt = txt.encode('utf-8', 'ignore'); txt = txt.decode('utf-8')
    #skip television credits
    if '(TV' in txt:
      continue
    #the year is the first run of four digits in the row's first span, if there is one
    span = row.span
    m = re.search(r'(\d\d\d\d)', span.text.strip()) if span is not None else None
    year = int(m.group(1)) if m else ''
    #only credits with exactly two links (movie, role) are recorded
    links = row.find_all("a")
    if len(links) != 2:
      continue
    cleaned = []
    for link in links:
      txt = link.text.strip().encode('utf-8', 'ignore').decode('utf-8')
      #strip the tracking query string and make the url absolute
      url = 'http://www.imdb.com' + re.sub(r'\?ref.+','',link["href"])
      cleaned.append((txt, url))
    (movie, movie_url), (role, role_url) = cleaned
    rec = [movie,movie_url,actor,actor_url,role,role_url,year]
    #send our list to the output handle
    writer.writerow(rec); goodct+=1
  results = [actor,goodct,actor_url]
  return results

In [8]:
#input previous work and set up file outputs

#input past results (one url per line), store in dictionary called pgsdone
#the file is read as text so the keys are str, the same type as the urls they are compared against later
pgsdone = {}
with open('imdb-pgs.txt', 'r') as infile:
  for line in infile:
    line = line.strip()
    #ignore blank lines
    if line: pgsdone[line]=1
print(len(pgsdone),'pgs already done')

#input collected records
#the file is written with csv.writer (tab-delimited), so it is read back with csv.reader to undo any quoting
movies = {}; actors = {}; characters = {}; allurls = {}; ctr=0
with open('imdb-recs.txt', 'r', newline='') as infile:
  for fields in csv.reader(infile, delimiter='\t'):
    ctr+=1
    #the first record is skipped
    if ctr==1: continue
    #blank lines come back as empty lists
    if not fields: continue
    #a record that does not have exactly seven fields cannot be unpacked; report it and move on
    if len(fields)!=7:
      print('warn: skipping malformed record',ctr,'with',len(fields),'fields')
      continue
    (movie,movie_url,actor,actor_url,character,character_url,year) = fields
    movies[movie_url]=movie; actors[actor_url]=actor; characters[character_url]=character
    allurls[movie_url]=movie; allurls[actor_url]=actor; allurls[character_url]=character
print(len(movies),'movies,',len(actors),'actors, and',len(characters),'characters in dataset')

#setup three output files (fbak for storing websites collected, fout for storing data scraped, fout2 for recording pages scraped)
#note all three are opened in append mode ('ab' for the raw html bytes, 'a' for the two text files) rather than write mode, which would erase previously collected data
fbak = open('src-pgs.txt', 'ab')
fout = open('imdb-recs.txt', 'a', newline='')
fout2 = open('imdb-pgs.txt', 'a')
#create a handle to write tab-delimited data to the file fout
writer = csv.writer(fout, delimiter='\t')

1 pgs already done
1 movies, 15 actors, and 14 characters in dataset


In [9]:
#iterate through urls to be collected

#maximum number of pages to request in one run of this cell
MAX_PAGES = 2

urlct=0
for url in allurls:
  #skip urls that have already been collected
  if url in pgsdone:
    continue
  #determine the type of url, and skip if don't recognize it (or if character b/c parser for this type of page doesn't exist yet)
  #(named pagetype so the builtin type() is not shadowed)
  if url in movies:
    pagetype = 'movie'
  elif url in actors:
    pagetype = 'actor'
  elif url in characters:
    pagetype = 'character'
    continue
  else:
    print("warn: don't recognize url type for",url)
    continue

  #keep track of the number of urls requested and quit after a certain number (by breaking out of the loop)
  urlct+=1
  if urlct>MAX_PAGES:
    break
  print("\nworking on:",url,"  type:",pagetype)

  #open the url (closing the connection when done); a failed request is reported and the page is left for a later run
  #URLError and HTTPError are subclasses of OSError
  html = None
  try:
    with urlopen(url) as resp:
      html = resp.read()
  except OSError as err:
    print("warn: could not fetch",url,"-",err)
  #sleep for a random number of seconds, whether or not the request succeeded
  time.sleep(random.randint(4, 10)); #note: sleeping is essential to avoid annoying web servers by calling them too often!
  if html is None:
    continue
  #store the page in the back up file
  fbak.write(html)

  #convert the html into a format that recognizes the structure of the html (and put it into a variable we're calling soup)
  soup = BeautifulSoup(html, "html.parser")

  if pagetype=='movie':
    #send parsed html and output handle to movie parser function
    (movie,goodct,movie_url)=parsemovie(soup,writer)
    #report how many good rows were found
    print('parsed',movie)
    print('actors found:',goodct)
    canonical_url = movie_url
  elif pagetype=='actor':
    #send parsed html and output handle to actor parser function
    (actor,goodct,actor_url)=parseactor(soup,writer)
    #report how many good rows were found
    print('parsed',actor)
    print('credits found:',goodct)
    canonical_url = actor_url

  #record the page as done under the url that was requested, since that is the value the skip check above looks up
  fout2.write(url + "\n")
  #also record the page's canonical url when it differs
  if canonical_url != url:
    fout2.write(canonical_url + "\n")


working on: http://www.imdb.com/title/tt0451279/   type: movie
parsed Wonder Woman (2017)
actors found: 15

working on: http://www.imdb.com/name/nm2933757/   type: actor
19 credits in filmography
parsed Gal Gadot
credits found: 11


In [10]:
#close the three output files, in the order backup html, scraped records, pages done
for handle in (fbak, fout, fout2):
  handle.close()